rm(list=ls()) 

###############################################################################
# EUREPOC PROCESSING SCRIPT
## creates following data files: 
## - eurepoc_year_attacks.RData
## - imputed_data_year.RData
## - ccode_data_controls.RData
###############################################################################

#load packages
library(tidyverse)
library(countrycode)
library(reshape2)
library(peacesciencer)
library(WDI)        
library(mice)
library(lubridate)
library(stringr)
library(MASS)
library(texreg)

#setwd 
setwd("/Users/cbj4/Desktop/replication")

# set seed 
# this seed must be set to 123 to get same coef in manuscript after mice imputation
set.seed(123)

##############################
#STEP 1: DATA LOAD AND CLEAN
##############################

#read data
EuRepoC <- read.csv("raw_data/EuRepoC_Global_Database_1.1-Table 1.csv")

#fix names
EuRepoC$initiator_country <- str_replace_all(EuRepoC$initiator_country, "Korea, Republic of", "Republic of Korea")
EuRepoC$initiator_country <- str_replace_all(EuRepoC$initiator_country, "Korea, Democratic People's Republic of", "Democratic People's Republic of Korea")
EuRepoC$initiator_country <- str_replace_all(EuRepoC$initiator_country, "Iran, Islamic Republic of", "Iran")

EuRepoC$initiator_country <- gsub("['']", "", EuRepoC$initiator_country )
EuRepoC$initiator_country <- gsub("\\[|\\]", "", EuRepoC$initiator_country)

EuRepoC$receiver_country <- str_replace_all(EuRepoC$receiver_country, "Korea, Republic of", "Republic of Korea")
EuRepoC$receiver_country <- str_replace_all(EuRepoC$receiver_country, "Korea, Democratic People's Republic of", "Democratic People's Republic of Korea")
EuRepoC$receiver_country <- str_replace_all(EuRepoC$receiver_country, "Iran, Islamic Republic of", "Iran")

EuRepoC$receiver_country <- gsub("['']", "", EuRepoC$receiver_country )
EuRepoC$receiver_country <- gsub("\\[|\\]", "", EuRepoC$receiver_country)

##############################
#STEP 2: FIX DATES
##############################

#create separate year, month, day columns 
EuRepoC <- separate(EuRepoC, start_date, sep = "-", into = c("start_year", "start_month", "start_day"))
EuRepoC <- separate(EuRepoC, end_date, sep = "-", into = c("end_year", "end_month", "end_day"))

### not every column has a year, month, day provided - separate function fills these with NAs
### some observations have blank start or end dates - separate function leaves these blank
### some observations have dates coded incorrectly (sequentially end year is sequentially prior to end year)

#change blank year cells to NAs 
EuRepoC$start_year <-ifelse(EuRepoC$start_year == "", NA, EuRepoC$start_year)
EuRepoC$end_year <-ifelse(EuRepoC$end_year == "", NA, EuRepoC$end_year)

#replace NA with 2018 (in EUREPOC codebook NA is given when attack is still ongoing)
EuRepoC$end_year <- ifelse(is.na(EuRepoC$end_year), 2018, EuRepoC$end_year) 
#correct end years that occur before start years
EuRepoC$end_year <- ifelse(EuRepoC$end_year < EuRepoC$start_year, EuRepoC$start_year, EuRepoC$end_year) 

#subset observations with date problems 
EuRepoC_check1 <- subset(EuRepoC, start_year > end_year | is.na(start_year) | is.na(end_year))
rm(EuRepoC_check1)

# Only two of the observations with date problems have non-state actor (non-state affiliated) attacks 
# I drop these observations.

#drop observations with start or end year NA
EuRepoC <- EuRepoC %>% drop_na(start_year)
EuRepoC <- EuRepoC %>% drop_na(end_year)

#drop observations with incorrect start end sequence (another check that all date issues were corrected)
EuRepoC <- subset(EuRepoC, start_year < end_year | start_year == end_year)

#subset to date range of study 
EuRepoC <- subset(EuRepoC, start_year < 2019)


##############################
#STEP 3: CREATE TARGET CCODES
##############################

#create separate target columns 
#the code cuts off at 10. One row has more than 10 targets but they are repeats of the same country. 
#this gets fixed in a later chunk when we remove multiple targets if they are in the same country. 
EuRepoC <- separate(EuRepoC, receiver_country, sep = ",", 
                    into = c("target1", "target2", "target3", "target4", 
                             "target5", "target6", "target7", "target8", 
                             "target9", "target10"))

### not every attacks has 10 targets - separate function fills target columns with with NAs

#create country codes for targets countries 
ccode_tar <- NA
ccode_matrix <- matrix(nrow = length(EuRepoC[,1]), ncol = 10)

for(i in 1:10){
  i <- i + 20
  ccode_tar <- countrycode(EuRepoC[,i],"country.name", "cown", nomatch = 0)
  i <- i - 20
  ccode_matrix[,i] <- ccode_tar
}

# add matrix of target ccodes into the original data 
EuRepoC <- cbind(EuRepoC, ccode_matrix)

#rename target ccode columns 
names(EuRepoC)[101:110] <- c("ccode_tar1", "ccode_tar2", "ccode_tar3", "ccode_tar4", 
                             "ccode_tar5", "ccode_tar6", "ccode_tar7", "ccode_tar8", 
                             "ccode_tar9", "ccode_tar10")

# create function to code non-country labels
label_func <- function(ccode_var, target_var){
  
  target_var <- str_trim(target_var, "right")
  target_var <- str_trim(target_var, "left")
  
  ccode_var <- ifelse(target_var == "NATO (region)", 995, ccode_var)
  ccode_var <- ifelse(target_var == "International Atomic Energy Agency", 995, ccode_var)
  ccode_var <- ifelse(target_var == "International Association of Athletics Federations", 995, ccode_var)
  ccode_var <- ifelse(target_var == "United Nations Environment Programme", 995, ccode_var)
  ccode_var <- ifelse(target_var == "United Nations Organization", 995, ccode_var)
  ccode_var <- ifelse(target_var == "International Monetary Fund", 995, ccode_var)
  ccode_var <- ifelse(target_var == "UNICEF", 995, ccode_var)
  ccode_var <- ifelse(target_var == "World Anti-Doping Agency", 995, ccode_var)
  ccode_var <- ifelse(target_var == "Interpol", 995, ccode_var)
  ccode_var <- ifelse(target_var == "United Nations", 995, ccode_var)
  ccode_var <- ifelse(target_var == "Organization for Security and Cooperation in Europe", 995, ccode_var)
  ccode_var <- ifelse(target_var == "United Nations Economic and Social Council", 995, ccode_var)
  ccode_var <- ifelse(target_var == "United Nations", 995, ccode_var)
  ccode_var <- ifelse(target_var == "Organization for Security and Cooperation in Europe", 995, ccode_var)
  
  ccode_var <- ifelse(target_var == "Global (region)", 999, ccode_var)
  ccode_var <- ifelse(target_var == "Africa", 999, ccode_var)
  ccode_var <- ifelse(target_var == "Central Asia (region)", 999, ccode_var)
  ccode_var <- ifelse(target_var == "Mena Region (region)", 999, ccode_var)
  ccode_var <- ifelse(target_var == "Central America (region)", 999, ccode_var)
  ccode_var <- ifelse(target_var == "Western Europe", 999, ccode_var)
  ccode_var <- ifelse(target_var == "Europe (region)", 999, ccode_var)
  ccode_var <- ifelse(target_var == "Southeast Asia (region)", 999, ccode_var)
  ccode_var <- ifelse(target_var == "Middle East (region)", 999, ccode_var)
  ccode_var <- ifelse(target_var == "Caucasus", 999, ccode_var)
  ccode_var <- ifelse(target_var == "North America", 999, ccode_var)
  ccode_var <- ifelse(target_var == "South Asia (region)", 999, ccode_var)
  ccode_var <- ifelse(target_var == "Eastern Europe", 999, ccode_var)
  ccode_var <- ifelse(target_var == "Asia (region)", 999, ccode_var)
  ccode_var <- ifelse(target_var == "EU (region)", 999, ccode_var)
  ccode_var <- ifelse(target_var == "Northern Europe", 999, ccode_var)
  ccode_var <- ifelse(target_var == "Eastern Asia (region)", 999, ccode_var)
  ccode_var <- ifelse(target_var == "Balkans (region)", 999, ccode_var)
  ccode_var <- ifelse(target_var == "South America", 999, ccode_var)
  
  ccode_var <- ifelse(target_var == "Hong Kong", 997, ccode_var)
  ccode_var <- ifelse(target_var == "Palestine", 997, ccode_var)
  ccode_var <- ifelse(target_var == "ISIS", 997, ccode_var)
  ccode_var <- ifelse(target_var == "Puerto Rico", 997, ccode_var)
  ccode_var <- ifelse(target_var == "Guernsey", 997, ccode_var)
  ccode_var <- ifelse(target_var == "Anguilla", 997, ccode_var)
  ccode_var <- ifelse(target_var == "Macao", 997, ccode_var)
  
  ccode_var <- ifelse(target_var == "Serbia", 345, ccode_var)
  
  return(ccode_var)
}

EuRepoC$ccode_tar1 <- label_func(EuRepoC$ccode_tar1, EuRepoC$target1)
EuRepoC$ccode_tar2 <- label_func(EuRepoC$ccode_tar2, EuRepoC$target2)
EuRepoC$ccode_tar3 <- label_func(EuRepoC$ccode_tar3, EuRepoC$target3)
EuRepoC$ccode_tar4 <- label_func(EuRepoC$ccode_tar4, EuRepoC$target4)
EuRepoC$ccode_tar5 <- label_func(EuRepoC$ccode_tar5, EuRepoC$target5)
EuRepoC$ccode_tar6 <- label_func(EuRepoC$ccode_tar6, EuRepoC$target6)
EuRepoC$ccode_tar7 <- label_func(EuRepoC$ccode_tar7, EuRepoC$target7)
EuRepoC$ccode_tar8 <- label_func(EuRepoC$ccode_tar8, EuRepoC$target8)
EuRepoC$ccode_tar9 <- label_func(EuRepoC$ccode_tar9, EuRepoC$target9)
EuRepoC$ccode_tar10 <- label_func(EuRepoC$ccode_tar10, EuRepoC$target10)

### create label of Int Org (995), Region (999), Territory (997)

#transform from wide to long
cyber_dat_long <- melt(EuRepoC, id.vars = c(1:100), 
                       measure.vars = c("ccode_tar1","ccode_tar2","ccode_tar3",
                                        "ccode_tar4", "ccode_tar5","ccode_tar6",
                                        "ccode_tar7","ccode_tar8", "ccode_tar9",
                                        "ccode_tar10"))

### transforming so that an incident is repeated in the data for each target
### 1202 incidents * 10 targets per attack = 12020 rows

# move target code to front of data and rename
cyber_dat_long <- cyber_dat_long[,c(ncol(cyber_dat_long),1:(ncol(cyber_dat_long)-1))]
names(cyber_dat_long)[1] <- "ccode_tar"

#remove rows when target code is equal to 0 or NA
cyber_dat_long <- subset(cyber_dat_long, ccode_tar != 0)

##############################
#STEP 4: CREATE ATTACK CCODES
##############################

#create separate attack columns 
cyber_dat_long <- separate(cyber_dat_long, 
                           initiator_country, sep = ",", 
                           into = c("attacker1", "attacker2", "attacker3", 
                                    "attacker4", "attacker5"))
### not every attacks has 5 attackers - separate function fills attack columns with with NAs

#create country codes for attack countries 
ccode_attack <- NA
ccode_matrix <- matrix(nrow = length(cyber_dat_long[,1]), ncol = 5)

for(i in 1:5){
  i <- i + 35
  ccode_attack <- countrycode(cyber_dat_long[,i],"country.name", "cown", nomatch = 0)
  i <- i - 35
  ccode_matrix[,i] <- ccode_attack
}

# add matrix of attack ccodes into the long data 
cyber_dat_long <- cbind(cyber_dat_long, ccode_matrix)

#rename target ccode columns 
names(cyber_dat_long)[107:111] <- c("ccode_attack1", "ccode_attack2", "ccode_attack3",
                                    "ccode_attack4", "ccode_attack5")

#add Serbia attack ccode in manually 
cyber_dat_long$ccode_attack1 <- ifelse(cyber_dat_long$attacker1 == "Serbia", 345, cyber_dat_long$ccode_attack1)

### after checking there are no more ccodes that need to be manually changed for attackers

##############################
#STEP 5: CREATE DIRECTED DYADS
##############################
ccode_dyad <- NA
ccode_matrix <- matrix(nrow = length(cyber_dat_long[,1]), ncol = 5)

for(i in 1:5){
  i <- i + 106
  ccode_dyad <- paste(cyber_dat_long[,1], cyber_dat_long[,i], sep="_")
  i <- i - 106
  ccode_matrix[,i] <- ccode_dyad
}

# add matrix of direct dyads into the long data 
cyber_dat_long <- cbind(cyber_dat_long, ccode_matrix)

#rename dyad columns 
names(cyber_dat_long)[112:116] <- c("dyad1","dyad2","dyad3","dyad4", "dyad5")

#transform again so that each dyad has a row
cyber_dat_long_dyad <- melt(cyber_dat_long, id.vars = c(1:111), 
                            measure.vars = c("dyad1","dyad2","dyad3",
                                             "dyad4", "dyad5"))
#move dyad code to front of data and rename 
cyber_dat_long_dyad <- cyber_dat_long_dyad[,c(ncol(cyber_dat_long_dyad),1:(ncol(cyber_dat_long_dyad)-1))]
names(cyber_dat_long_dyad)[1] <- "dyad"

#separate dyad to get attacker code in single colum
cyber_dat_long_dyad <- separate(cyber_dat_long_dyad, col = dyad, 
                                into = c("ccode1", "ccode2"), sep = "_")
names(cyber_dat_long_dyad)[2] <- "ccode_attack"

#remove extra attack ccode column and drop all attacks without ccode info
cyber_dat_long_dyad <- subset(cyber_dat_long_dyad, ccode_attack != 0, select = c(2:114))

cyber_dat_long_dyad$ccode_attack <- as.numeric(cyber_dat_long_dyad$ccode_attack)
cyber_dat_long_dyad$ccode_tar <- as.numeric(cyber_dat_long_dyad$ccode_tar)

#remove duplicate rows from countries listed twice as initiators (countries are
#list more than once if more than 1 group from that country is involved in the attack)
cyber_dat_long_dyad <- distinct(cyber_dat_long_dyad)

##############################
#STEP 6: EXPAND YEARS
##############################
cyber_dat_long_dyad$end_year <- as.numeric(cyber_dat_long_dyad$end_year)
cyber_dat_long_dyad$start_year <- as.numeric(cyber_dat_long_dyad$start_year)

cyber_dat_long_dyad %>%
  rowwise() %>%
  mutate(year = list(seq(start_year, end_year))) %>%
  unnest(cols = c(year)) -> cyber_DDY

##############################
#STEP 7: CREATE ATTACK COUNTS
##############################

#start with all attacks (state and non-state)
#create attacker-year variable
cyber_DDY$attacker_year <- paste(cyber_DDY$ccode_attack, cyber_DDY$year, sep="_")
length(unique(cyber_DDY$attacker_year))

cyber_DDY <- subset(cyber_DDY, year < 2023)

#create an attack count for each unique attacker year
cyber_DDY$attack_bin <- rep(1)
unique_attacker_year <- aggregate(cyber_DDY$attack_bin, list(cyber_DDY$attacker_year), sum)
unique_attacker_year <- separate(unique_attacker_year, col = Group.1, 
                                 into = c("ccode_attack", "year"), sep = "_")
names(unique_attacker_year)[3] <- "attacks"

#second create non-state attacks 
#create binary for whether an attack had a non-state attacker
cyber_DDY$nonstate <- ifelse(grepl("Non-state", cyber_DDY$initiator_category), 1, 0)
cyber_DDY$nonstate <- ifelse(grepl("Individual", cyber_DDY$initiator_category), 1, cyber_DDY$nonstate)
#cyber_DDY$nonstate <- ifelse(grepl("Unknown", cyber_DDY$initiator_category), 1, cyber_DDY$nonstate)

#create binary for whether an attacker was a state or had suspected state affiliation 
cyber_DDY$state <- ifelse(grepl("State", cyber_DDY$initiator_category), 1, 0)
cyber_DDY$state <- ifelse(grepl("state-affiliated", cyber_DDY$inclusion_criteria_subcode), 1, cyber_DDY$state)
cyber_DDY$state <- ifelse(grepl("state-affiliation", cyber_DDY$initiator_category), 1, cyber_DDY$state)
cyber_DDY$state <- ifelse(grepl("state-attribution", cyber_DDY$inclusion_criteria), 1, cyber_DDY$state)

# subset attacks based on non-state actor criteria 
#cyber_DDY <- subset(cyber_DDY, INTRA != 1)
cyber_DDY <- subset(cyber_DDY, nonstate == 1)
cyber_DDY <- subset(cyber_DDY, state != 1)

# create non-state attack count 
cyber_DDY$disrupt_bin <- rep(1)
unique_attacker_year_disrupt <- aggregate(cyber_DDY$disrupt_bin, list(cyber_DDY$attacker_year), sum)
unique_attacker_year_disrupt <- separate(unique_attacker_year_disrupt, col = Group.1, 
                                         into = c("ccode_attack", "year"), sep = "_")
names(unique_attacker_year_disrupt)[3] <- "disruption_attacks"


################################################
# STEP 8: CREATE BASE COUNTRY-YEAR ATTACK DATA
################################################

#create country-year base
country_base <- create_stateyears(system = "cow", mry = T) %>%
  filter(between(year, 2000, 2022)) 

#%>% add_cow_majors() 

### this function has a bug 
### it adds 1s for every country as a COW MAJ in 2017 and 2018

#fix cow major dummy for 2017 and 2018 
#country_base$cowmaj <- ifelse(country_base$year == 2017, 0, country_base$cowmaj)
#country_base$cowmaj <- ifelse(country_base$year == 2018, 0, country_base$cowmaj)
#country_base$cowmaj <- ifelse(country_base$ccode == 2 |country_base$ccode == 200 |
#country_base$ccode == 222 |country_base$ccode == 255 |
#country_base$ccode == 365 |country_base$ccode == 710 |
#country_base$ccode == 740, 1, country_base$cowmaj)

# create time until the next election variable
country_base$time_to_elect <- rep(0)
country_base$time_to_elect <- ifelse(country_base$year == 2021 | country_base$year == 2017 | country_base$year == 2013 | 
                                       country_base$year == 2009 | country_base$year == 2005 |
                                       country_base$year == 2001, 1, country_base$time_to_elect)
country_base$time_to_elect <- ifelse(country_base$year == 2020 |
                                       country_base$year == 2016 | country_base$year == 2012 | 
                                       country_base$year == 2008 | country_base$year == 2004 |
                                       country_base$year == 2000, 2, country_base$time_to_elect)
country_base$time_to_elect <- ifelse(country_base$year == 2019 |
                                       country_base$year == 2015 | country_base$year == 2011 | 
                                       country_base$year == 2007 | country_base$year == 2003, 
                                     3, country_base$time_to_elect)

#create dummy variable for each time to elect category (1-4)
country_base$T_0 <- ifelse(country_base$time_to_elect == 0, 1, 0)
country_base$T_1 <- ifelse(country_base$time_to_elect == 1, 1, 0)
country_base$T_2 <- ifelse(country_base$time_to_elect == 2, 1, 0)
country_base$T_3 <- ifelse(country_base$time_to_elect == 3, 1, 0)

#create T variable (linear time)
country_base$time <- (country_base$year + 1) - 2000

#merge base with attack counts 
country_year_attacks <- merge(country_base, unique_attacker_year, 
                              by.x = c("ccode", "year"), by.y = c("ccode_attack", "year"), 
                              all.x = T)
country_year_attacks <- merge(country_year_attacks, unique_attacker_year_disrupt, 
                              by.x = c("ccode", "year"), by.y = c("ccode_attack", "year"), 
                              all.x = T)
country_year_attacks$attacks[is.na(country_year_attacks$attacks)] <- 0
country_year_attacks$disruption_attacks[is.na(country_year_attacks$disruption_attacks)] <- 0

# read in percent usage (control) for every country year 
ITU_percent <- read.csv("raw_data/ITU_percent.csv")
#ITU_percent <- read.csv("~/My Drive /Cyber proxies - ITU /Data - Controls/ITU_percent.csv")

#add ccodes to percent usage data
ITU_percent$ccode <- countrycode(ITU_percent$Economy, 
                                 origin = "country.name", destination = "cown", 
                                 nomatch = 0)

# manually add ccode for countries not matched by function 
ITU_percent$ccode <- ifelse(ITU_percent$Economy == "Serbia", 345, ITU_percent$ccode)
ITU_percent$ccode <- ifelse(ITU_percent$Economy == "Türkiye", 640, ITU_percent$ccode)

#transform percent usage from wide to long
ITU_percent_long <- melt(ITU_percent, id.vars = c(1:69), 
                         measure.vars = c("X2000_value", "X2001_value", 
                                          "X2002_value", "X2003_value", 
                                          "X2004_value", "X2005_value", 
                                          "X2006_value", "X2007_value",
                                          "X2008_value", "X2009_value", 
                                          "X2010_value", "X2011_value", 
                                          "X2012_value", "X2013_value", 
                                          "X2014_value", "X2015_value", 
                                          "X2016_value", "X2017_value", 
                                          "X2018_value", "X2019_value", 
                                          "X2020_value", "X2021_value"))

#create a year variable from the column names
ITU_percent_long$year <- as.numeric(gsub('[X_value]','',ITU_percent_long$variable))
ITU_percent_long <- subset(ITU_percent_long, select = c(69,71,72))
names(ITU_percent_long)[2] <- "percent_usage"

#merge percent usage with base country-year attacks data
country_year_attacks <- merge(country_year_attacks, ITU_percent_long, by = c("ccode", "year"), all.x = T)

#########################################
# STEP 9: CREATE ITU CANDIDATE TREATMENT
##########################################

# read in ITU council and ITU radio board data
ITU <- read.csv("raw_data/ITU.csv")
#ITU <- read.csv("ITU.csv")

#load("~/My Drive/Cyber proxies - ITU /Data - ITU/ITU.RData")
#council_ITU <- ITU

#RB_ITU <- read.csv("RB_ITU.csv")
RB_ITU <- read.csv("raw_data/RB_ITU.csv")

names(RB_ITU)[1] <- "state"
names(RB_ITU)[4] <- "elect_year"

#start with elect year 2002
ITU <- subset(ITU, elect_year > 2000)
RB_ITU <- subset(RB_ITU, elect_year > 2000)
ITU_merge <- rbind(ITU, RB_ITU)

#add ccodes to ITU data
ITU_merge$ccode <- countrycode(ITU_merge$state, 
                               "country.name", "cown", nomatch = 0)

#add ccodes for states not matched by function
ITU_merge$ccode <- ifelse(ITU_merge$state == "United  States", 2, ITU_merge$ccode)
ITU_merge$ccode <- ifelse(ITU_merge$state == "Serbia", 345, ITU_merge$ccode)

#create unique ITU candidate years 
ITU_merge$ccode_year <- paste(ITU_merge$ccode, ITU_merge$elect_year, sep="_")
unique_ITU_year <- as.data.frame(table(ITU_merge$ccode_year))
unique_ITU_year <- separate(unique_ITU_year, col = Var1, into = c("ccode", "year"), 
                            sep = "_")
unique_ITU_year <- subset(unique_ITU_year, select = c(1,2))
unique_ITU_year$candidate <- rep(1)

#expand candidate years to include years prior to election and create candidate binary
#for merge into base data

unique_ITU_year$year <- as.numeric(unique_ITU_year$year)

year_subtract1 <- function(y, x){
  ccode <- as.data.frame(print(y))
  year <- as.data.frame(print(x - 1))
  df1 <- data.frame(ccode, year)
  colnames(df1) <- c("ccode","year")
  return(df1)
}

df1 <- year_subtract1(unique_ITU_year$ccode, unique_ITU_year$year)
df1$candidate <- rep(1)

year_subtract2 <- function(y, x){
  ccode <- as.data.frame(print(y))
  year <- as.data.frame(print(x - 2))
  df2 <- data.frame(ccode, year)
  colnames(df2) <- c("ccode","year")
  return(df2)
}

df2 <- year_subtract2(unique_ITU_year$ccode, unique_ITU_year$year)
df2$candidate <- rep(1)

year_subtract3 <- function(y, x){
  ccode <- as.data.frame(print(y))
  year <- as.data.frame(print(x - 3))
  df3 <- data.frame(ccode, year)
  colnames(df3) <- c("ccode","year")
  return(df3)
}

df3 <- year_subtract3(unique_ITU_year$ccode, unique_ITU_year$year)
df3$candidate <- rep(1)

#combine all years into one data set and exclude years prior to 2000
SY_ITU_dat <- rbind(unique_ITU_year, df1, df2, df3)
SY_ITU_dat <- subset(SY_ITU_dat, year > 1999)

#merge ITU candidate status into country year attack base data 
country_year_attacks <- merge(country_year_attacks, SY_ITU_dat, by = c("ccode", "year"), all.x = T)
country_year_attacks$candidate[is.na(country_year_attacks$candidate)] <- 0

#create spell factors for within analysis 

country_year_attacks$spell <- ifelse(country_year_attacks$time < 4, 1, 0)
country_year_attacks$spell <- ifelse(country_year_attacks$time < 8 & country_year_attacks$time > 3 , 2, country_year_attacks$spell)
country_year_attacks$spell <- ifelse(country_year_attacks$time < 12 & country_year_attacks$time > 7 , 3, country_year_attacks$spell)
country_year_attacks$spell <- ifelse(country_year_attacks$time < 16 & country_year_attacks$time > 11 , 4, country_year_attacks$spell)
country_year_attacks$spell <- ifelse(country_year_attacks$time < 20 & country_year_attacks$time > 15 , 5, country_year_attacks$spell)
country_year_attacks$spell <- ifelse(country_year_attacks$time > 19, 6, country_year_attacks$spell)

country_year_attacks$country_spell <- paste(country_year_attacks$ccode, country_year_attacks$spell, sep = "_")
country_year_attacks$country_spell <- factor(country_year_attacks$country_spell)

#######################################################################
# STEP 10: CREATE FINAL "eurepoc_year_attacks.RData" DATA SAVE AS RData
#######################################################################

# rename dataset
eurepoc_year_data <- country_year_attacks
#rename treatment variable 
eurepoc_year_data$election <- eurepoc_year_data$T_0
#remove all attacks variable 
eurepoc_year_data <- subset(eurepoc_year_data, select = -c(attacks))
#rename non-state attacks outcome variable 
names(eurepoc_year_data)[10] <- "attacks"
#save(eurepoc_year_data, file = "~/My Drive/Cyber proxies - ITU /R&R/eurepoc_year_attacks.RData")

##########################################
# STEP 11: CHECK MODEL 1
##########################################

#create within candidate data 
SY_within <- subset(eurepoc_year_data, candidate == 1)
SY_within <- subset(SY_within, year < 2019)

#create all country data 
eurepoc_year_data <- subset(eurepoc_year_data, year < 2019)

#create treatment variable as function of candidate and election year
eurepoc_year_data$treat <- ifelse(eurepoc_year_data$candidate == 1 & eurepoc_year_data$election == 1, 1, 0)

# Model 1: Subset non-candidates
m1 <- lm(attacks ~ treat + factor(country_spell), data = eurepoc_year_data)
summary(m1)

##########################################
# STEP 12: ADD CONTROLS W/ IMPUTATION
##########################################

# create country year base 
ccode_years <- create_stateyears(system = "cow", mry = T) %>%
  filter(between(year, 2000, 2018)) 

ccode_vector <- unique(eurepoc_year_data$ccode)
ccode_years <- ccode_years[(ccode_years$ccode %in% ccode_vector),]

# check country number
length(unique(ccode_years$ccode))

# add model variables into base
model <- subset(eurepoc_year_data, select = c("ccode", "year", "candidate", "attacks", "spell", "country_spell", 
                                              "T_0", "T_1", "T_2", "T_3", "election", "time", "time_to_elect", 
                                              "treat"))

ccode_years <- merge(ccode_years, model, by = c("ccode", "year"), all.x = T)

ccode_years$candidate[is.na(ccode_years$candidate)] <- 0

# check that number of candidate years matches
table(ccode_years$candidate)

#add WDI controls data 
controls <- WDI(
  country = "all",
  indicator = c("NY.GDP.PCAP.KD","BX.GSR.CCIS.ZS", "TM.VAL.ICTG.ZS.UN", "IP.JRN.ARTC.SC", "IT.NET.USER.ZS", 
                "IT.NET.BBND.P2", "IT.CEL.SETS.P2", "SI.POV.DDAY"),
  start = 2000,
  end = 2018,
  extra = FALSE,
  cache = NULL,
  latest = NULL,
  language = "en"
)

controls$ccode <- countrycode(controls$iso3c, "iso3c", "cown")
controls$ccode <- ifelse(controls$iso3c == "SRB", 345, controls$ccode)

controls <- subset(controls, select = c(4:13))
names(controls)[2:9] <- c("gdp_percap", "ict_exports", "ict_imports", "tech_articles", "internet_usage", "broadband_subs", "mobile_subs", "extreme_poverty")

# Add international bandwidth 
int_bandwidth <- read.csv("raw_data/international-bandwidth.csv")

int_bandwidth$ccode <- countrycode(int_bandwidth$isoCode, origin = "iso3c", destination = "cown")
int_bandwidth$ccode <- ifelse(int_bandwidth$isoCode == "SRB", 345, int_bandwidth$ccode)

names(int_bandwidth)[8] <- "year"
int_bandwidth <- subset(int_bandwidth, select = c(8,11, 12))
names(int_bandwidth)[2] <- "bandwidth"

# Add annual investment in tele services 
tele_invest <- read.csv("raw_data/annual-investment-in-telecommunication-services.csv")

tele_invest$ccode <- countrycode(tele_invest$isoCode, origin = "iso3c", destination = "cown")
tele_invest$ccode <- ifelse(tele_invest$isoCode == "SRB", 345, tele_invest$ccode)

names(tele_invest)[8] <- "year"
tele_invest <- subset(tele_invest, select = c(8,11, 12))
names(tele_invest)[2] <- "tele_invest"

# merge with controls 
ccode_years <- merge(ccode_years, controls, by = c("ccode", "year"), all.x = T)
ccode_years <- merge(ccode_years, int_bandwidth, by = c("ccode", "year"), all.x = T)
ccode_years <- merge(ccode_years, tele_invest, by = c("ccode", "year"), all.x = T)

#######################################################################
# STEP 13: CREATE FINAL "ccode_data_controls.RData" DATA SAVE AS RData
#######################################################################

#save(ccode_years, file = "data/ccode_data_controls.RData")

###################################
# STEP 14: CREATE IMPUTED VALUES
##################################
predM <- quickpred(ccode_years, exclude = c("ccode","year", "statenme", "candidate", "spell", "election", "treat", 
                                            "country_spell", "T_0", "T_1", "T_2", "T_3", "time", "time_to_elect"),
                   method = "pearson")


impute_data <- ccode_years
impute_data$ccode <- factor(impute_data$ccode)
impute_data$year <- factor(impute_data$year)

#imp0 <- mice(impute_data,maxit = 0)

#predM <- imp0$predictorMatrix

#predM[, c("ccode")] <- 0
#predM[, c("year")] <- 0
#predM[, c("time_to_elect")] <- 0
#predM[, c("T_0")] <- 0
#predM[, c("T_1")] <- 0
#predM[, c("T_2")] <- 0
#predM[, c("T_3")] <- 0
#predM[, c("time")] <- 0
#predM[, c("attacks")] <- 0
#predM[, c("spell")] <- 0
#predM[, c("country_spell")] <- 0
#predM[, c("candidate")] <- 0
#predM[, c("election")] <- 0
#predM[, c("statenme")] <- 0
#predM[, c("broadband_subs")] <- 0

imp1 <- mice(impute_data, maxit = 5, 
             predictorMatrix = predM, print =  TRUE, method = "cart", remove_collinear = TRUE )

imp1$loggedEvents
#summary(imp1) 
d.long <- complete(imp1,"long",include = T)

#######################################################################
# STEP 15: CREATE FINAL "imputed_data_year.RData" DATA SAVE AS RData
#######################################################################

#save(d.long, file = "data/imputed_data_year.RData")
#save(d.long, file = "data/imputed_data_year.RData")

###########################
# STEP 16: CHECK MODEL 2
###########################

imp.candidates <- as.mids(d.long)

fit.A <- with(imp.candidates, lm(attacks ~ treat + factor(country_spell) +
                                   internet_usage + ict_exports + log_tech_articles))
m2 <- pool(fit.A)
summary(m2)